---
title: "R Notebook"
output:
  html_document:
    df_print: paged
---




#Data

Loading data from the UNGDC data


```{r message=FALSE}
#Loading packages and data
library(readtext)
library(quanteda)
library(rworldmap)
library(RColorBrewer)
library(haven)
library(readxl)

library(tidyverse)
library(tidymodels)
library(rsample)

# Modelling packages
library(caret)
library(caretEnsemble)
library(earth)
library(xgboost)
library(ranger)
library(rpart)
library(rpart.plot)

# Model interpretability packages
library(vip)
library(pdp)
library(lime)
library(jtools)
```



```{r}
total_counts <- read_csv("total_counts.csv") %>% filter(year>1999)
```



```{r}
model_data_imputed <- read_csv("model_data_imputed.csv") %>% 
  filter(country %in% total_counts$country) %>%
  left_join(total_counts, by = c("country", "year")) %>%
  arrange(country, year)

model_data_imputed$intersection_count <- model_data_imputed$intersection_count %>% replace_na(0)
model_data_imputed$climate_count <- model_data_imputed$climate_count %>% replace_na(0)
model_data_imputed$health_count <- model_data_imputed$health_count %>% replace_na(0)

model_data_imputed$SIDS <- factor(model_data_imputed$SIDS)
model_data_imputed$EU <- factor(model_data_imputed$EU)
model_data_imputed$G77 <- factor(model_data_imputed$G77)
model_data_imputed$African_Group <- factor(model_data_imputed$African_Group)
model_data_imputed$Arab_States <- factor(model_data_imputed$Arab_States)
model_data_imputed$EIG <- factor(model_data_imputed$EIG)
model_data_imputed$Umbrella_Group <- factor(model_data_imputed$Umbrella_Group)
model_data_imputed$LDCs <- factor(model_data_imputed$LDCs)

model_data_imputed$GDPpc <- log(model_data_imputed$GDPpc)


```





```{r}
model_data_imputed %>% 
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill=value)) + 
    geom_raster() + 
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    scale_fill_grey(name = "", 
                    labels = c("Present", 
                               "Missing")) +
    xlab("Observation") +
    theme(axis.text.y  = element_text(size = 6))
```

```{r}
library(psych)
as.data.frame(describe(model_data_imputed)) %>% rownames_to_column("Variables") %>% write_csv("summary_stats_ungd.csv")

```


################
# Count measures

## Decision Tree

```{r}
model_data_imputed_c <- model_data_imputed %>% select(-country)
```


```{r}
decision_tree <- rpart(
  formula = intersection_count ~ . - year - climate_count - health_count,
  data    = model_data_imputed_c, 
  cp = 0.007,
  method  = "poisson"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_intersection.pdf", width = 12, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Decision Tree: Intersection")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Decision Tree Variable Importance: Intersection")

ggsave("./UNGD_results/dt/decision_tree_vip_intersection.pdf")

```



```{r}
decision_tree <- rpart(
  formula = climate_count ~ . - year - intersection_count - health_count,
  data    = model_data_imputed_c, 
  cp = 0.001,
  method  = "poisson"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_climate.pdf", width = 15, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Decision Tree: Climate")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Decision Tree Variable Importance: Climate")

ggsave("./UNGD_results/dt/decision_tree_vip_climate.pdf")

```



```{r}
decision_tree <- rpart(
  formula = health_count ~ . - year - intersection_count - climate_count,
  data    = model_data_imputed_c, 
  cp = 0.02,
  method  = "poisson"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_health.pdf", width = 15, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Decision Tree: Health")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Decision Tree Variable Importance: Health")

ggsave("./UNGD_results/dt/decision_tree_vip_health.pdf")

```






##Random Forest
### Intersection Analysis

```{r}

rf1 <- ranger(
  formula = intersection_count ~ . - year - country - climate_count - health_count,
  data    = model_data_imputed,
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf1$prediction.error))
## [1] 0.7951457

```

```{r}

##TUNING PARAMETERS

n_features <- rf1$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
    formula = intersection_count ~ . - year - country - climate_count - health_count,
    data    = model_data_imputed, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```

```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = intersection_count ~ . - year - country - climate_count - health_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = TRUE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
    formula = intersection_count ~ . - year - country - climate_count - health_count,
    data    = model_data_imputed, 
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = TRUE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```

```{r}
rf_permutation
```

```{r}
enframe(rf_permutation$variable.importance) %>%
   unnest %>% 
  arrange(-value) %>%
  write_csv("random_forest_permutation_intersection.csv")
```


```{r}
vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Intersection")

ggsave("./UNGD_results/rf/random_forest_impurity_intersection.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() 

ggsave("./UNGD_results/rf/random_forest_permutation_intersection.pdf")


```








## Health Count Analysis

```{r}

rf2 <- ranger(
  formula = health_count ~ . - year - country - climate_count - intersection_count,
  data    = model_data_imputed,
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf2$prediction.error))
## [1] 2.425273

```

```{r}

##TUNING PARAMETERS

n_features <- rf2$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
    formula = health_count ~ . - year - country - climate_count - intersection_count,
    data    = model_data_imputed, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```


```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = health_count ~ . - year - country - climate_count - intersection_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .63,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
    formula = health_count ~ . - year - country - climate_count - intersection_count,
    data    = model_data_imputed, 
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .63,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```


```{r}
 vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Health")
 
 ggsave("./UNGD_results/rf/random_forest_impurity_health.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Permutation Importance: Health")

ggsave("./UNGD_results/rf/random_forest_permutation_health.pdf")


```












## Climate Change Count Analysis

```{r}

rf3 <- ranger(
  formula = climate_count ~ . - year - country - health_count - intersection_count,
  data    = model_data_imputed,
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf3$prediction.error))
## [1] 2.806517

```

```{r}

##TUNING PARAMETERS

n_features <- rf3$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
    formula = climate_count ~ . - year - country - health_count - intersection_count,
    data    = model_data_imputed, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```


```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = climate_count ~ . - year - country - health_count - intersection_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 6,
  min.node.size = 1,
  sample.fraction = .63,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
    formula = climate_count ~ . - year - country - health_count - intersection_count,
    data    = model_data_imputed, 
  num.trees = 5000,
  mtry = 6,
  min.node.size = 1,
  sample.fraction = .63,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```


```{r}
 vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Climate Change")
 
 ggsave("./UNGD_results/rf/random_forest_impurity_climate.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Permutation Importance: Climate Change")

ggsave("./UNGD_results/rf/random_forest_permutation_climate.pdf")


```


################
# Binary

```{r}
model_data_imputed_b <- model_data_imputed %>%
  add_column(intersection_binary=NA) %>% 
  add_column(climate_binary=NA) %>% 
  add_column(health_binary=NA)



model_data_imputed_b <- model_data_imputed_b %>%
  mutate(
    intersection_binary = case_when(
      intersection_count > 0 ~ 1,
      TRUE                 ~ 0
    )
  )

model_data_imputed_b <- model_data_imputed_b %>%
  mutate(
    climate_binary = case_when(
      climate_count > 0 ~ 1,
      TRUE                 ~ 0
    )
  )

model_data_imputed_b <- model_data_imputed_b %>%
  mutate(
    health_binary = case_when(
      health_count > 0 ~ 1,
      TRUE                 ~ 0
    )
  )

model_data_imputed_b$intersection_binary <- factor(model_data_imputed_b$intersection_binary)
model_data_imputed_b$climate_binary <- factor(model_data_imputed_b$climate_binary)
model_data_imputed_b$health_binary <- factor(model_data_imputed_b$health_binary)

```



## Decision Tree

```{r}
model_data_imputed_c <- subset(model_data_imputed_b, select = -c(year, country, climate_count, intersection_count, health_count))
```


```{r}
decision_tree <- rpart(
  formula = intersection_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(climate_binary,health_binary)) , 
  cp = 0.001,
  method  = "class"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_intersection_classification.pdf", width = 12, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Classification Decision Tree: Intersection")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Classification Decision Tree Variable Importance: Intersection")

ggsave("./UNGD_results/dt/decision_tree_vip_intersection_classification.pdf")

```


```{r}
decision_tree <- rpart(
  formula = health_binary  ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary, climate_binary)) , 
  cp = 0.001,
  method  = "class"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_health_classification.pdf", width = 15, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Classification Decision Tree: Health")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Classification Decision Tree Variable Importance: Health")

ggsave("./UNGD_results/dt/decision_tree_vip_health_classification.pdf")

```



```{r}
decision_tree <- rpart(
  formula = climate_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary,health_binary)) , 
  cp = 0.001,
  method  = "class"
)


#decision_tree

#summary(decision_tree)

pdf("./UNGD_results/dt/decision_tree_climate_classification.pdf", width = 15, height = 5)

rpart.plot(decision_tree, type = 4, extra = 101, main = "Classification Decision Tree: Climate")

dev.off()

#vi_scores <- vi(decision_tree, method = "permute", target = "HES", metric = "rmse", pred_wrapper = predict)
vip(decision_tree, num_features = 25, bar = TRUE) + theme_bw() + ggtitle("Classification Decision Tree Variable Importance: Climate")

ggsave("./UNGD_results/dt/decision_tree_vip_climate_classification.pdf")

```










##Random Forest
### Intersection Analysis

```{r}

rf1 <- ranger(
  formula = intersection_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(climate_binary,health_binary)),
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf1$prediction.error))
## [1] 0.7951457

```

```{r}

##TUNING PARAMETERS

n_features <- rf1$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
  formula = intersection_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(climate_binary,health_binary)), 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```

```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = intersection_count ~ . - year - country - climate_count - health_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
   formula = intersection_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(climate_binary,health_binary)), 
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```


```{r}
vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Intersection")

ggsave("./UNGD_results/rf/random_forest_impurity_intersection.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Classification Random Forest Permutation Importance: Intersection")

ggsave("./UNGD_results/rf/random_forest_permutation_intersection_classification.pdf")


```








## Health Analysis

```{r}

rf2 <- ranger(
  formula = health_binary  ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary, climate_binary)),
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf2$prediction.error))
## [1] 2.425273

```

```{r}

##TUNING PARAMETERS

n_features <- rf2$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
  formula = health_binary  ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary, climate_binary)),
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```


```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = health_count ~ . - year - country - climate_count - intersection_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
  formula = health_binary  ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary, climate_binary)),
  num.trees = 5000,
  mtry = 3,
  min.node.size = 10,
  sample.fraction = .5,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```


```{r}
 vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Health")
 
 ggsave("./UNGD_results/rf/random_forest_impurity_health.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Classification Random Forest Permutation Importance: Health")

ggsave("./UNGD_results/rf/random_forest_permutation_health_classification.pdf")


```







## Climate Change Analysis

```{r}

rf3 <- ranger(
  formula = climate_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary,health_binary)),
  mtry = floor(25/ 3),
  respect.unordered.factors = "order",
  seed = 123
)


# Baseline model
# get OOB RMSE
(default_rmse <- sqrt(rf3$prediction.error))
## [1] 2.806517

```

```{r}

##TUNING PARAMETERS

n_features <- rf3$num.independent.variables

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)
```


```{r}
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
  formula = climate_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary,health_binary)), 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
```


```{r}
# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```



#### RERUN THE MODEL WITH OPTIMAL HYPERPARAMETERS

##### re-run model with permutation-based variable importance

```{r}
rf_impurity <- ranger(
    formula = climate_count ~ . - year - country - health_count - intersection_count,
    data    = model_data_imputed,
  num.trees = 5000,
  mtry = 6,
  min.node.size = 3,
  sample.fraction = .8,
  replace = TRUE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

rf_permutation <- ranger(
  formula = climate_binary ~ . ,
  data    = subset(model_data_imputed_c, select = -c(intersection_binary,health_binary)), 
  num.trees = 5000,
  mtry = 6,
  min.node.size = 3,
  sample.fraction = .8,
  replace = TRUE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```


```{r}
 vip(rf_impurity, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Random Forest Impurity Importance: Climate Change")
 
 ggsave("./UNGD_results/rf/random_forest_impurity_climate.pdf")

vip(rf_permutation, num_features = n_features, bar = TRUE) + theme_bw() + ggtitle("Classification Random Forest Permutation Importance: Climate Change")

ggsave("./UNGD_results/rf/random_forest_permutation_climate_classification.pdf")


```







